## Clear R-workspace
rm(list=ls(all=TRUE))
## Close all graphic devices
graphics.off()
#####################
### Load packages ###
#####################
library(pacman)
pacman::p_load(extrafont,DT,stringr,dplyr,plyr,tibble,tidyverse,data.table, ggplot2,strex,Hmisc,hrbrthemes,
ggpubr,mlbench,Amelia,caret, DMwR, DataExplorer,FactoMineR,OutlierDetection,factoextra, parallel,doParallel,scales,precrec,
RColorBrewer, pander, rlist,
reconPlots,
survival,
survminer, strex, ggplotify,flextable,ggsci, naniar,viridis,cowplot,ggthemes,dichromat)
extrafont::loadfonts(device="win")
windowsFonts(sans="Palatino Linotype")
loadfonts(device="win")
loadfonts(device="postscript")
###########################
### Main analysis paths ###
###########################
# Main
scriptsPath <- paste0("scripts/")
scriptsFunctionsPath <- paste0(scriptsPath,"functions/")
projectDataPath <- paste0("data/")
# Input
tcgaInputData <- paste0(projectDataPath,"TCGA/")
antiPDL1publicInputData <- paste0(projectDataPath,"antiPD(L)1_public/")
otherInputData <- paste0(projectDataPath,"other/")
referenceInputData <- paste0(projectDataPath,"reference/")
# # Output
projectOutDataPath <- paste0("output/data_files/")
# tcgaIntermediateData <- paste0(projectOutDataPath,"TCGA/")
antiPDL1publicIntermediateData <- paste0(projectOutDataPath,"antiPD(L)1_public/")
figuresPath <- paste0("output/figures/")
# tablesPath <- paste0("output/tables/")
# modelsPath <- paste0("output/models/")
# # #Intermediate output
# testDataOut <-paste0(modelsPath,"test_data/")
# rocCompareOut <-paste0(modelsPath,"roc_compare/")
# modelsInterm <- paste0(modelsPath,"models_interm_files/")
# Session/dependencies
sessionInfoPath <- paste0("session_info/")
##########################
### FOLDER/FILES NAMES ###
##########################
# Create list with folders names for the different datasets.
datasets <- c("Hugo","Riaz","Gide","Liu","Rod","Miao","Immotion150","Kim","Mari", "Powles","Braun")
data.foldersList <- as.list(c("Hugo_MEL","Riaz_MEL","Gide_MEL", "Liu_MEL","Rodriguez_MEL","Miao_RCC","Immo150_RCC","Kim_GAS","Mari_BLA","Powles_BLA","Braun_RCC"))
names(data.foldersList)<- c("Hugo","Riaz","Gide", "Liu","Rod", "Miao","Immotion150","Kim","Mari","Powles","Braun")
##################################
### LOAD SOURCE FUNCTIONS FILE ###
##################################
source(paste0(scriptsFunctionsPath,"MissingData_functions.R"), local=T)
#####################
### File suffixes ###
#####################
Rdata.suffix <- ".RData"
# Other script variables
# analysis_type <- "_TCRaIGH_noLiu"
modelTypeA <- "_TCRArichnessIGH.ds"
modelTypeB <- "_noLIU"
modelTypeC <- "_mintcr4bcr10"
We are using the data that were TCRa min 4 clones, BCR igh min 10 clones
immdata.TcrBcr.full.red <- loadRData(paste0(antiPDL1publicIntermediateData,"immdata.TcrBcr.full.red",modelTypeA,modelTypeB,modelTypeC,".RData"))
immdata.TcrBcr.full.red$response <- ifelse(immdata.TcrBcr.full.red$response=="CR+PR","CRPR",immdata.TcrBcr.full.red$response)
First filtering, to keep only CR, PR, PD samples
## FILTER DATA FOR PD, CR, PR (.sub)
immdata.TcrBcr.full.red.sub <- immdata.TcrBcr.full.red %>% filter(response %in% c("PD","CRPR")) %>% droplevels()
From 695 samples (that include all responses), we drop down to 519, that are CR, PR, PD pre-treatment samples.
Looking at how many of the data have TMB info
datatable(immdata.TcrBcr.full.red.sub %>% select(run_accession,response,tissue) %>% group_by(tissue,response) %>% dplyr::summarise(no_rows=length(response)), extensions = 'Buttons', options = list(
dom = 'Bfrtip',
buttons = c('copy', 'excel', 'csv' ),
scrollX=TRUE,
pageLength=15
),
caption = 'Number of responders/non-responders in all tissues'
)
Looking at how many of the data have TMB info
datatable(immdata.TcrBcr.full.red.sub %>% filter(complete.cases(TMB)) %>% select(run_accession,response,tissue) %>% group_by(tissue,response) %>% dplyr::summarise(no_rows=length(response)), extensions = 'Buttons', options = list(
dom = 'Bfrtip',
buttons = c('copy', 'excel', 'csv' ),
scrollX=TRUE,
pageLength=15
),
caption = 'Number of responders/non-responders in all tissues that have TMB'
)
From 519 CRPR, PD samples, we drop down to 373, that are CR, PR, PD pre-treatment samples. The only tissues we can actually use for the model are melanoma and bladder, that have more than 100 samples.
Prepare other variables and dataframes necessary for the analysis.
## EXTRACT RESPONSE, NOMINAL, MEASURE VARIABLES FROM DATA
response.var <- "response"
nominal.vars <- c("dataset","tissue","gender")
measure.vars <-colnames(immdata.TcrBcr.full.red.sub)[c(2:10,12,32:43)]
measure.vars <- measure.vars[c(6,22,1:3,20,21,4:5,10,7:9,11:19)]
## CHECK INFINITE LOG TMB VALUES
immdata.TcrBcr.full.red.sub[which(is.infinite(immdata.TcrBcr.full.red.sub$logTMB)),]
## [1] run_accession TuTACK_sigscore TIS_sigscore PDL1
## [5] CD8A CD8B TCRArichness.ds StromalScore
## [9] ImmuneScore ESTIMATEScore title age
## [13] gender disease_status biopsy_site primary_tumor
## [17] treatment prior_treatment response OS.time
## [21] OS PFS.time PFS pre_on_treatment
## [25] total_muts nonsyn_muts clonal_muts subclonal_muts
## [29] TMB tissue dataset Bcells
## [33] CD8cells CD4cells Tregs NKcells
## [37] DCcells Monocytes Granulocytes Macrophages
## [41] TumorPurity logTMB BCR.IGHrichness.ds
## <0 rows> (or 0-length row.names)
## Extract tissues in data so far
tissues <- unique(immdata.TcrBcr.full.red.sub$tissue)
# Keep samples with TMB (.tmb)
immdata.TcrBcr.full.red.sub.tmb <- immdata.TcrBcr.full.red.sub %>% filter(complete.cases(logTMB)) %>% droplevels()#;nrow(immdata.TcrBcr.full.red.sub.tmb)
# ONLY MELANOMA, WITH TMB (.tmb.MEL)
immdata.TcrBcr.full.red.sub.tmb.MEL <- immdata.TcrBcr.full.red.sub.tmb %>% filter(tissue=="melanoma") %>% droplevels()#;nrow(immdata.TcrBcr.full.red.sub.tmb.MEL)
## ALL MELANOMA AND BLADDER, WITH TMB (.tmb.ALL)
immdata.TcrBcr.full.red.sub.tmb.ALL <- immdata.TcrBcr.full.red.sub.tmb %>% filter(tissue %in% c("melanoma","bladder")) %>% droplevels()#;nrow(immdata.TcrBcr.full.red.sub.tmb.ALL)
# ALL DATA, MELANOMA AND BLADDER, BUT LEAVE MISSING TMB (.ALL)
immdata.TcrBcr.full.red.sub.ALL <- immdata.TcrBcr.full.red.sub %>% filter(tissue %in% c("melanoma","bladder")) %>% droplevels()#;nrow(immdata.TcrBcr.full.red.sub.ALL)
# Which are the covariates I am considering
covariatesIn <- measure.vars[c(1:2,4:7,14:16)]
responseVar <- "response"
## INSTEAD OF FILTERING FOR NO MISSING TMB, I DO THAT FOR ALL COVARIATES OF INTEREST
# Remove rows that have outliers/missing values since I get errors with Knn imputation
immdata.TcrBcr.full.red.sub.ALL.filt <- immdata.TcrBcr.full.red.sub.ALL %>% filter_at(vars(covariatesIn), all_vars(complete.cases(.))) %>% droplevels()
# DOING THE SAME FOR THE ORIGINAL DATA - USE THIS WHEN TESTING
# these data include other tissues apart from melanoma and bladder
immdata.TcrBcr.full.red.sub.filt <- immdata.TcrBcr.full.red.sub %>% filter_at(vars(covariatesIn), all_vars(complete.cases(.))) %>% droplevels()
# FINAL DATA FOR MODELLING
# Remove the Liu dataset for the model testing
immdata.TcrBcr.BLAD.MEL.ready <-immdata.TcrBcr.full.red.sub.ALL.filt %>% filter(dataset %notin% c("Liu") ) %>% droplevels() # All melanoma & bladder, PD, CR, PR, complete data
immdata.TcrBcr.ALL.ready <- immdata.TcrBcr.full.red.sub.filt %>% filter(dataset %notin% c("Liu") ) %>% droplevels() # All tissues, PD, CR, PR, complete data
## For gastric
immdata.TcrBcr.full.red.sub.filt.gastric <- immdata.TcrBcr.full.red.sub %>% filter(tissue %in% c("gastric") ) %>%
filter_at(vars(covariatesIn[-6]), all_vars(complete.cases(.))) %>% droplevels()
## covariatesIn,"OS","OS.time","age","gender","disease_status"
factors <- c(covariatesIn,"OS","OS.time","age","gender","disease_status","tissue","dataset")#,"prior_treatment"
df.full <- immdata.TcrBcr.full.red %>% select(all_of(factors)) %>% droplevels()
df.full$gender <- ifelse(df.full$gender=="",NA,df.full$gender)
df.full$disease_status <- ifelse(df.full$disease_status=="",NA,
ifelse(df.full$disease_status=="Unknown",NA,df.full$disease_status))
# Are there missing values in the dataset?
any_na(df.full)
[1] TRUE
# How many?
n_miss(df.full)
[1] 1215
prop_miss(df.full)
[1] 0.1092626
# Which variables are affected?
df.full %>% is.na() %>% colSums() %>% pander()
| TCRArichness.ds | BCR.IGHrichness.ds | TIS_sigscore | PDL1 | TumorPurity |
|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 |
| logTMB | Bcells | CD8cells | CD4cells | OS | OS.time | age | gender |
|---|---|---|---|---|---|---|---|
| 198 | 0 | 0 | 0 | 128 | 130 | 496 | 87 |
| disease_status | tissue | dataset |
|---|---|---|
| 176 | 0 | 0 |
datatable(miss_var_summary(df.full), extensions = 'Buttons', options = list(
dom = 'Bfrtip',
buttons = c('copy', 'excel', 'csv' ),
scrollX=TRUE,
pageLength=15
),
caption = 'number of missings per variable'
)
datatable(miss_var_table(df.full), extensions = 'Buttons', options = list(
dom = 'Bfrtip',
buttons = c('copy', 'excel', 'csv' ),
scrollX=TRUE,
pageLength=15
),
caption = 'Tabulate the missings in the variables'
)
datatable(miss_case_summary(df.full), extensions = 'Buttons', options = list(
dom = 'Bfrtip',
buttons = c('copy', 'excel', 'csv' ),
scrollX=TRUE,
pageLength=15
),
caption = 'number of missings per participant (n and %)'
)
datatable(miss_case_table(df.full), extensions = 'Buttons', options = list(
dom = 'Bfrtip',
buttons = c('copy', 'excel', 'csv' ),
scrollX=TRUE,
pageLength=15
),
caption = 'number of missings per participant (n and %)'
)
# Get number of missings per participant (n and %)
gg_miss_var(df.full)
vis_miss(df.full) + theme(axis.text.x = element_text(angle=80))
gg_miss_upset(df.full)
# Which variables are affected?
df.full %>% dplyr::filter(tissue=="melanoma") %>% droplevels() %>% is.na() %>% colSums() %>% pander()
| TCRArichness.ds | BCR.IGHrichness.ds | TIS_sigscore | PDL1 | TumorPurity |
|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 |
| logTMB | Bcells | CD8cells | CD4cells | OS | OS.time | age | gender |
|---|---|---|---|---|---|---|---|
| 68 | 0 | 0 | 0 | 0 | 2 | 92 | 3 |
| disease_status | tissue | dataset |
|---|---|---|
| 44 | 0 | 0 |
# Get number of missings per participant (n and %)
gg_miss_var(df.full %>% dplyr::filter(tissue=="melanoma") %>% droplevels())
vis_miss(df.full %>% dplyr::filter(tissue=="melanoma") %>% droplevels()) + theme(axis.text.x = element_text(angle=80))
gg_miss_upset(df.full %>% dplyr::filter(tissue=="melanoma") %>% droplevels())
sapply(colnames(df.full)[1:9], function(x) ggplot(df.full %>% dplyr::filter(tissue=="melanoma") %>% droplevels(), aes_string(x = "dataset", y = x)) + geom_boxplot() + geom_jitter(size=0.9, alpha=0.9,aes_string(x = "dataset", y = x, color="disease_status")), simplify = FALSE)
$TCRArichness.ds
$BCR.IGHrichness.ds
$TIS_sigscore
$PDL1
$TumorPurity
$logTMB
$Bcells
$CD8cells
$CD4cells
sapply(colnames(df.full)[1:9], function(x) ggplot(df.full %>% dplyr::filter(tissue=="melanoma") %>% droplevels(), aes_string(x = "dataset", y = x)) + geom_boxplot() + geom_jitter(size=0.9, alpha=0.9,aes_string(x = "dataset", y = x, color="gender")), simplify = FALSE)
$TCRArichness.ds
$BCR.IGHrichness.ds
$TIS_sigscore
$PDL1
$TumorPurity
$logTMB
$Bcells
$CD8cells
$CD4cells
sapply(colnames(df.full)[1:9], function(x) ggplot(df.full %>% dplyr::filter(tissue=="melanoma") %>% droplevels(), aes_string(x = "dataset", y = x)) + geom_boxplot() + geom_jitter(size=0.9, alpha=0.9,aes_string(x = "dataset", y = x, color="age")) + scale_color_viridis(alpha=0.6), simplify = FALSE)
$TCRArichness.ds
$BCR.IGHrichness.ds
$TIS_sigscore
$PDL1
$TumorPurity
$logTMB
$Bcells
$CD8cells
$CD4cells
sapply(colnames(df.full)[1:9], function(x) ggplot(df.full %>% dplyr::filter(tissue=="melanoma") %>% droplevels(), aes_string(x = "dataset", y = x)) + geom_boxplot() + geom_jitter(size=0.9, alpha=0.9,aes_string(x = "dataset", y = x, color="prior_treatment")), simplify = FALSE)
df.full.mel <- df.full %>% dplyr::filter(tissue=="melanoma") %>% droplevels() %>% group_split(dataset)
# Which variables are affected?
df.full.mel %>% map(is.na) %>% map(colSums)
[[1]] TCRArichness.ds BCR.IGHrichness.ds TIS_sigscore PDL1 0 0 0 0 TumorPurity logTMB Bcells CD8cells 0 38 0 0 CD4cells OS OS.time age 0 0 0 0 gender disease_status tissue dataset 0 38 0 0
[[2]] TCRArichness.ds BCR.IGHrichness.ds TIS_sigscore PDL1 0 0 0 0 TumorPurity logTMB Bcells CD8cells 0 0 0 0 CD4cells OS OS.time age 0 0 1 0 gender disease_status tissue dataset 0 0 0 0
[[3]] TCRArichness.ds BCR.IGHrichness.ds TIS_sigscore PDL1 0 0 0 0 TumorPurity logTMB Bcells CD8cells 0 0 0 0 CD4cells OS OS.time age 0 0 0 89 gender disease_status tissue dataset 0 0 0 0
[[4]] TCRArichness.ds BCR.IGHrichness.ds TIS_sigscore PDL1 0 0 0 0 TumorPurity logTMB Bcells CD8cells 0 3 0 0 CD4cells OS OS.time age 0 0 0 3 gender disease_status tissue dataset 3 6 0 0
[[5]] TCRArichness.ds BCR.IGHrichness.ds TIS_sigscore PDL1 0 0 0 0 TumorPurity logTMB Bcells CD8cells 0 27 0 0 CD4cells OS OS.time age 0 0 1 0 gender disease_status tissue dataset 0 0 0 0
df.full.mel %>% map(gg_miss_var)
[[1]]
[[2]]
[[3]]
[[4]]
[[5]]
df.full.mel %>% map(vis_miss) #%>% map(theme(axis.text.x = element_text(angle=80)))
[[1]]
[[2]]
[[3]]
[[4]]
[[5]]
#df.full.mel %>% map(gg_miss_upset)
# Which variables are affected?
df.full %>% dplyr::filter(tissue=="bladder") %>% droplevels() %>% is.na() %>% colSums() %>% pander()
| TCRArichness.ds | BCR.IGHrichness.ds | TIS_sigscore | PDL1 | TumorPurity |
|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 |
| logTMB | Bcells | CD8cells | CD4cells | OS | OS.time | age | gender |
|---|---|---|---|---|---|---|---|
| 58 | 0 | 0 | 0 | 68 | 68 | 344 | 68 |
| disease_status | tissue | dataset |
|---|---|---|
| 98 | 0 | 0 |
# Get number of missings per participant (n and %)
gg_miss_var(df.full %>% dplyr::filter(tissue=="bladder") %>% droplevels())
vis_miss(df.full %>% dplyr::filter(tissue=="bladder") %>% droplevels()) + theme(axis.text.x = element_text(angle=80))
gg_miss_upset(df.full %>% dplyr::filter(tissue=="bladder") %>% droplevels())
sapply(colnames(df.full)[1:9], function(x) ggplot(df.full %>% dplyr::filter(tissue=="bladder") %>% droplevels(), aes_string(x = "dataset", y = x)) + geom_boxplot(), simplify = FALSE)
$TCRArichness.ds
$BCR.IGHrichness.ds
$TIS_sigscore
$PDL1
$TumorPurity
$logTMB
$Bcells
$CD8cells
$CD4cells
sapply(colnames(df.full)[1:9], function(x) ggplot(df.full %>% dplyr::filter(tissue=="bladder") %>% droplevels(), aes_string(x = "dataset", y = x)) + geom_boxplot() + geom_jitter(size=0.9, alpha=0.9,aes_string(x = "dataset", y = x, color="gender")), simplify = FALSE)
$TCRArichness.ds
$BCR.IGHrichness.ds
$TIS_sigscore
$PDL1
$TumorPurity
$logTMB
$Bcells
$CD8cells
$CD4cells
sapply(colnames(df.full)[1:9], function(x) ggplot(df.full %>% dplyr::filter(tissue=="bladder") %>% droplevels(), aes_string(x = "dataset", y = x)) + geom_boxplot() + geom_jitter(size=0.9, alpha=0.9,aes_string(x = "dataset", y = x, color="age")) + scale_color_viridis(alpha=0.6), simplify = FALSE)
$TCRArichness.ds
$BCR.IGHrichness.ds
$TIS_sigscore
$PDL1
$TumorPurity
$logTMB
$Bcells
$CD8cells
$CD4cells
df.full.blad <- df.full %>% dplyr::filter(tissue=="bladder") %>% droplevels() %>% group_split(dataset)
# Which variables are affected?
df.full.blad %>% map(is.na) %>% map(colSums)
[[1]] TCRArichness.ds BCR.IGHrichness.ds TIS_sigscore PDL1 0 0 0 0 TumorPurity logTMB Bcells CD8cells 0 57 0 0 CD4cells OS OS.time age 0 0 0 276 gender disease_status tissue dataset 0 30 0 0
[[2]] TCRArichness.ds BCR.IGHrichness.ds TIS_sigscore PDL1 0 0 0 0 TumorPurity logTMB Bcells CD8cells 0 1 0 0 CD4cells OS OS.time age 0 68 68 68 gender disease_status tissue dataset 68 68 0 0
df.full.blad %>% map(gg_miss_var)
[[1]]
[[2]]
df.full.blad %>% map(vis_miss) #%>% map(theme(axis.text.x = element_text(angle=80)))
[[1]]
[[2]]
df.full.blad %>% map(gg_miss_upset)
[[1]]
[[2]]
# Which variables are affected?
df.full %>% dplyr::filter(tissue=="RCC") %>% droplevels() %>% is.na() %>% colSums() %>% pander()
| TCRArichness.ds | BCR.IGHrichness.ds | TIS_sigscore | PDL1 | TumorPurity |
|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 |
| logTMB | Bcells | CD8cells | CD4cells | OS | OS.time | age | gender |
|---|---|---|---|---|---|---|---|
| 29 | 0 | 0 | 0 | 60 | 60 | 60 | 16 |
| disease_status | tissue | dataset |
|---|---|---|
| 34 | 0 | 0 |
# Get number of missings per participant (n and %)
gg_miss_var(df.full %>% dplyr::filter(tissue=="RCC") %>% droplevels())
vis_miss(df.full %>% dplyr::filter(tissue=="RCC") %>% droplevels()) + theme(axis.text.x = element_text(angle=80))
gg_miss_upset(df.full %>% dplyr::filter(tissue=="RCC") %>% droplevels())
sapply(colnames(df.full)[1:9], function(x) ggplot(df.full %>% dplyr::filter(tissue=="RCC") %>% droplevels(), aes_string(x = "dataset", y = x)) + geom_boxplot(), simplify = FALSE)
$TCRArichness.ds
$BCR.IGHrichness.ds
$TIS_sigscore
$PDL1
$TumorPurity
$logTMB
$Bcells
$CD8cells
$CD4cells
sapply(colnames(df.full)[1:9], function(x) ggplot(df.full %>% dplyr::filter(tissue=="RCC") %>% droplevels(), aes_string(x = "dataset", y = x)) + geom_boxplot() + geom_jitter(size=0.9, alpha=0.9,aes_string(x = "dataset", y = x, color="gender")), simplify = FALSE)
$TCRArichness.ds
$BCR.IGHrichness.ds
$TIS_sigscore
$PDL1
$TumorPurity
$logTMB
$Bcells
$CD8cells
$CD4cells
sapply(colnames(df.full)[1:9], function(x) ggplot(df.full %>% dplyr::filter(tissue=="RCC") %>% droplevels(), aes_string(x = "dataset", y = x)) + geom_boxplot() + geom_jitter(size=0.9, alpha=0.9,aes_string(x = "dataset", y = x, color="age")) + scale_color_viridis(alpha=0.6), simplify = FALSE)
$TCRArichness.ds
$BCR.IGHrichness.ds
$TIS_sigscore
$PDL1
$TumorPurity
$logTMB
$Bcells
$CD8cells
$CD4cells
df.full.rcc <- df.full %>% dplyr::filter(tissue=="RCC") %>% droplevels() %>% group_split(dataset)
# Which variables are affected?
df.full.rcc %>% map(is.na) %>% map(colSums)
[[1]] TCRArichness.ds BCR.IGHrichness.ds TIS_sigscore PDL1 0 0 0 0 TumorPurity logTMB Bcells CD8cells 0 16 0 0 CD4cells OS OS.time age 0 60 60 60 gender disease_status tissue dataset 16 6 0 0
[[2]] TCRArichness.ds BCR.IGHrichness.ds TIS_sigscore PDL1 0 0 0 0 TumorPurity logTMB Bcells CD8cells 0 13 0 0 CD4cells OS OS.time age 0 0 0 0 gender disease_status tissue dataset 0 28 0 0
df.full.rcc %>% map(gg_miss_var)
[[1]]
[[2]]
df.full.rcc %>% map(vis_miss) #%>% map(theme(axis.text.x = element_text(angle=80)))
[[1]]
[[2]]
df.full.rcc %>% map(gg_miss_upset)
[[1]]
[[2]]
sapply(colnames(df.full)[1:9], function(x) ggplot(df.full %>% dplyr::filter(tissue=="gastric") %>% droplevels(), aes_string(x = "dataset", y = x)) + geom_boxplot(), simplify = FALSE)
$TCRArichness.ds
$BCR.IGHrichness.ds
$TIS_sigscore
$PDL1
$TumorPurity
$logTMB
$Bcells
$CD8cells
$CD4cells
sapply(colnames(df.full)[1:9], function(x) ggplot(df.full %>% dplyr::filter(tissue=="gastric") %>% droplevels(), aes_string(x = "dataset", y = x)) + geom_boxplot() + geom_jitter(size=0.9, alpha=0.9,aes_string(x = "dataset", y = x, color="gender")), simplify = FALSE)
$TCRArichness.ds
$BCR.IGHrichness.ds
$TIS_sigscore
$PDL1
$TumorPurity
$logTMB
$Bcells
$CD8cells
$CD4cells
sapply(colnames(df.full)[1:9], function(x) ggplot(df.full %>% dplyr::filter(tissue=="gastric") %>% droplevels(), aes_string(x = "dataset", y = x)) + geom_boxplot() + geom_jitter(size=0.9, alpha=0.9,aes_string(x = "dataset", y = x, color="age")) + scale_color_viridis(alpha=0.6), simplify = FALSE)
$TCRArichness.ds
$BCR.IGHrichness.ds
$TIS_sigscore
$PDL1
$TumorPurity
$logTMB
$Bcells
$CD8cells
$CD4cells
# Which variables are affected?
df.full %>% dplyr::filter(tissue=="gastric") %>% droplevels() %>% is.na() %>% colSums() %>% pander()
| TCRArichness.ds | BCR.IGHrichness.ds | TIS_sigscore | PDL1 | TumorPurity |
|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 |
| logTMB | Bcells | CD8cells | CD4cells | OS | OS.time | age | gender |
|---|---|---|---|---|---|---|---|
| 43 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| disease_status | tissue | dataset |
|---|---|---|
| 0 | 0 | 0 |
# Get number of missings per participant (n and %)
gg_miss_var(df.full %>% dplyr::filter(tissue=="gastric") %>% droplevels())
vis_miss(df.full %>% dplyr::filter(tissue=="gastric") %>% droplevels()) + theme(axis.text.x = element_text(angle=80))
#gg_miss_upset(df.full %>% dplyr::filter(tissue=="gastric") %>% droplevels())
ggplot(df.full %>% dplyr::filter(tissue=="melanoma") %>% droplevels(), aes(x = factor(dataset), y = age)) + geom_boxplot()
# ggplot(df.full %>% dplyr::filter(tissue=="melanoma") %>% droplevels(), aes(x = factor(dataset), y = gender)) + geom_bar()
df.full %>% dplyr::filter(tissue=="melanoma") %>% droplevels() %>%
group_by(dataset) %>%
dplyr::count(gender) %>%
mutate(prop = n / sum(n)) %>%
ggplot(aes(x = dataset, y = prop, fill = gender)) +
geom_bar(stat = "identity") #+
#coord_flip()
# ggplot(df.full %>% dplyr::filter(tissue=="melanoma") %>% droplevels(), aes(x = factor(dataset), y = gender)) + geom_bar()
df.full %>% dplyr::filter(tissue=="melanoma") %>% droplevels() %>%
group_by(dataset) %>%
dplyr::count(disease_status) %>%
mutate(prop = n / sum(n)) %>%
ggplot(aes(x = dataset, y = prop, fill = disease_status)) +
geom_bar(stat = "identity") #+
#coord_flip()
# ggplot(df.full %>% dplyr::filter(tissue=="melanoma") %>% droplevels(), aes(x = factor(dataset), y = gender)) + geom_bar()
df.full %>% dplyr::filter(tissue=="bladder") %>% droplevels() %>%
group_by(dataset) %>%
dplyr::count(gender) %>%
mutate(prop = n / sum(n)) %>%
ggplot(aes(x = dataset, y = prop, fill = gender)) +
geom_bar(stat = "identity") #+
#coord_flip()
# ggplot(df.full %>% dplyr::filter(tissue=="melanoma") %>% droplevels(), aes(x = factor(dataset), y = gender)) + geom_bar()
df.full %>% dplyr::filter(tissue=="bladder") %>% droplevels() %>%
group_by(dataset) %>%
dplyr::count(disease_status) %>%
mutate(prop = n / sum(n)) %>%
ggplot(aes(x = dataset, y = prop, fill = disease_status)) +
geom_bar(stat = "identity") #+
#coord_flip()
ggplot(df.full %>% dplyr::filter(tissue=="RCC") %>% droplevels(), aes(x = factor(dataset), y = age)) + geom_boxplot()
# ggplot(df.full %>% dplyr::filter(tissue=="melanoma") %>% droplevels(), aes(x = factor(dataset), y = gender)) + geom_bar()
df.full %>% dplyr::filter(tissue=="RCC") %>% droplevels() %>%
group_by(dataset) %>%
dplyr::count(gender) %>%
mutate(prop = n / sum(n)) %>%
ggplot(aes(x = dataset, y = prop, fill = gender)) +
geom_bar(stat = "identity") #+
#coord_flip()
# ggplot(df.full %>% dplyr::filter(tissue=="melanoma") %>% droplevels(), aes(x = factor(dataset), y = gender)) + geom_bar()
df.full %>% dplyr::filter(tissue=="RCC") %>% droplevels() %>%
group_by(dataset) %>%
dplyr::count(disease_status) %>%
mutate(prop = n / sum(n)) %>%
ggplot(aes(x = dataset, y = prop, fill = disease_status)) +
geom_bar(stat = "identity") #+
#coord_flip()
tissue
1 bladder 2 gastric 3 melanoma 4 RCC
events <-sapply(test, function(x) x %>% dplyr::count(OS) %>% filter(OS==1) %>% pull(n))
names(events) <-group_keys(ir) %>% pull(tissue)
pander(events)
| bladder | gastric | melanoma | RCC |
|---|---|---|---|
| 180 | 23 | 118 | 19 |
Now to get a number on how many events per covariate:
# Divide by the minimum of covariates
pander(events/9)
| bladder | gastric | melanoma | RCC |
|---|---|---|---|
| 20 | 2.556 | 13.11 | 2.111 |
Check also how many events and cencored
df.full %>%
group_by(tissue) %>%
dplyr::count(OS) %>%
mutate(prop = n / sum(n)) %>%
ggplot(aes(x = tissue, y = prop, fill = OS)) +
geom_bar(stat = "identity") + theme(axis.text.x = element_text(angle=45))
df.full %>%
group_by(dataset) %>%
dplyr::count(OS) %>%
mutate(prop = n / sum(n)) %>%
ggplot(aes(x = dataset, y = prop, fill = OS)) +
geom_bar(stat = "identity") + theme(axis.text.x = element_text(angle=45))
Even if we find that MAR is covered (per histology), we still have a problem imputing missing values across different datasets that make up the tissue cohorts. We are using multicenter data, data coming from different trials and the bias can be much more from what we see.
What is missing from where:
melMis <-ggarrange(plotlist=misPlots, common.legend = TRUE,legend="none", labels=c("Riaz et al.","Hugo et al.","Gide et al.","Liu et al.","Rodriguez et al."),font.label = list(size = 15, color = "black", face = "bold", family = "Palatino Linotype")
)
melMis
bladMis <- ggarrange(plotlist=misPlots, common.legend = TRUE,legend="none", labels=c("Mariathasan et al.","Powles et al."),font.label = list(size = 15, color = "black", face = "bold", family = "Palatino Linotype"))
bladMis
rccMis <-ggarrange(plotlist=misPlots, common.legend = TRUE,legend="none", labels=c("McDermott et al.","Miao et al."),font.label = list(size = 15, color = "black", face = "bold", family = "Palatino Linotype"))
rccMis
ONE DATASET USED FOR SURVIVAL
gasMis<-ggarrange(plotlist=misPlots, common.legend = TRUE,legend="bottom", labels=c("Kim et al."),font.label = list(size = 15, color = "black", face = "bold", family = "Palatino Linotype"))
gasMis
Gastric does not have TMB data at all
Conclusion
Imputation is no worth it and not consistently possible for our analyses.
# library(cowplot)
p <- ggdraw() +
draw_plot(melMis,x = 0, y = 0.6, width = .98, height = .38) +
draw_plot(rccMis,x = 0, y = 0.41, width = .68, height = .18) +
draw_plot(bladMis,x = 0, y = 0.22, width = .8, height =.18)+
draw_plot(gasMis,x = 0, y = 0, width = .333, height = .22)+
draw_plot_label(label = c("A: SKCM anti-PD-1/L1",
"B: KIRC anti-PD-1/L1",
"C: BLCA anti-PD-1/L1",
"D: STAD anti-PD-1/L1"), size = 16,
x = c(-0.04,-0.038,-0.038,-0.038), y = c(.99,.6,.41,.23),family="Palatino Linotype")
p
cairo_pdf(filename = paste0(figuresPath,"Supplemental Figure S2",".pdf"),family = "Palatino Linotype", width = 22, height=18)
print(p)
dev.off()
png 2
# ggsave(p, filename = paste0(figuresPath,"Supplemental Figure S2.tiff"), dpi = 600,width = 22,
# height = 22, units = "in")
session_info <- sessionInfo()
writeLines(capture.output(session_info), paste0(sessionInfoPath,"E_07_MissingData.txt"))
sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19045)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] parallel grid stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] dichromat_2.0-0.1 ggthemes_4.2.4 cowplot_1.1.1
## [4] viridis_0.6.2 viridisLite_0.4.1 naniar_0.6.1
## [7] ggsci_2.9 flextable_0.7.0 ggplotify_0.1.0
## [10] survminer_0.4.9 reconPlots_0.1.0 rlist_0.4.6.2
## [13] pander_0.6.5 RColorBrewer_1.1-3 precrec_0.12.9
## [16] scales_1.2.0 doParallel_1.0.17 iterators_1.0.14
## [19] foreach_1.5.2 factoextra_1.0.7 OutlierDetection_0.1.1
## [22] FactoMineR_2.4 DataExplorer_0.8.2 DMwR_0.1.0
## [25] cluster_2.1.2 abind_1.4-5 rpart_4.1.16
## [28] class_7.3-20 ROCR_1.0-11 quantmod_0.4.20
## [31] TTR_0.24.3 xts_0.12.1 zoo_1.8-10
## [34] caret_6.0-92 Amelia_1.8.0 Rcpp_1.0.9
## [37] mlbench_2.1-3 ggpubr_0.4.0 hrbrthemes_0.8.0
## [40] Hmisc_4.7-0 Formula_1.2-4 survival_3.3-1
## [43] lattice_0.20-45 strex_1.4.2 data.table_1.14.2
## [46] forcats_0.5.1 purrr_0.3.4 readr_2.1.2
## [49] tidyr_1.2.0 ggplot2_3.3.6 tidyverse_1.3.1
## [52] tibble_3.1.7 plyr_1.8.7 dplyr_1.0.9
## [55] stringr_1.4.0 DT_0.22 extrafont_0.18
## [58] pacman_0.5.1
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.2 tidyselect_1.1.2 htmlwidgets_1.5.4
## [4] pROC_1.18.0 munsell_0.5.0 codetools_0.2-18
## [7] interp_1.1-3 future_1.26.1 withr_2.5.0
## [10] spatstat.random_2.2-0 colorspace_2.0-3 highr_0.9
## [13] uuid_1.1-0 knitr_1.41 rstudioapi_0.13
## [16] leaps_3.1 stats4_4.1.0 officer_0.4.4
## [19] ggsignif_0.6.3 tensor_1.5 Rttf2pt1_1.3.10
## [22] listenv_0.8.0 labeling_0.4.2 KMsurv_0.1-5
## [25] mnormt_2.0.2 polyclip_1.10-0 farver_2.1.1
## [28] depth_2.1-1.1 parallelly_1.32.1 vctrs_0.4.1
## [31] generics_0.1.3 circular_0.4-95 ipred_0.9-13
## [34] xfun_0.35 R6_2.5.1 gridGraphics_0.5-1
## [37] spatstat.utils_2.3-1 assertthat_0.2.1 networkD3_0.4
## [40] nnet_7.3-17 gtable_0.3.1 globals_0.15.1
## [43] goftest_1.2-3 timeDate_4021.104 rlang_1.1.3
## [46] systemfonts_1.0.4 scatterplot3d_0.3-41 splines_4.1.0
## [49] rstatix_0.7.0 extrafontdb_1.0 lazyeval_0.2.2
## [52] ModelMetrics_1.2.2.2 spatstat.geom_2.4-0 broom_1.0.1
## [55] checkmate_2.1.0 rgl_0.108.3 yaml_2.3.6
## [58] reshape2_1.4.4 modelr_0.1.8 crosstalk_1.2.0
## [61] backports_1.4.1 tools_4.1.0 lava_1.6.10
## [64] ellipsis_0.3.2 spatstat.core_2.4-2 jquerylib_0.1.4
## [67] base64enc_0.1-3 deldir_1.0-6 haven_2.5.1
## [70] ggrepel_0.9.1 fs_1.5.2 magrittr_2.0.3
## [73] reprex_2.0.1 RANN_2.6.1 tmvnsim_1.0-2
## [76] mvtnorm_1.1-3 xtable_1.8-4 hms_1.1.2
## [79] evaluate_0.18 jpeg_0.1-9 readxl_1.4.0
## [82] gridExtra_2.3 compiler_4.1.0 crayon_1.5.2
## [85] htmltools_0.5.3 mgcv_1.8-40 tzdb_0.3.0
## [88] visdat_0.5.3 lubridate_1.8.0 DBI_1.1.2
## [91] dbplyr_2.1.1 MASS_7.3-57 boot_1.3-28
## [94] Matrix_1.4-0 car_3.1-1 cli_3.6.2
## [97] gower_1.0.0 igraph_1.3.1 km.ci_0.5-6
## [100] pkgconfig_2.0.3 flashClust_1.01-2 foreign_0.8-82
## [103] plotly_4.10.0 spatstat.sparse_2.1-1 recipes_1.0.1
## [106] xml2_1.3.3 ldbod_0.1.2 bslib_0.3.1
## [109] hardhat_1.2.0 prodlim_2019.11.13 rvest_1.0.2
## [112] yulab.utils_0.0.4 digest_0.6.29 spatstat.data_2.2-0
## [115] rmarkdown_2.14 cellranger_1.1.0 survMisc_0.5.6
## [118] htmlTable_2.4.1 gdtools_0.2.4 curl_5.2.0
## [121] lifecycle_1.0.1 nlme_3.1-157 jsonlite_1.8.3
## [124] carData_3.0-5 fansi_1.0.3 pillar_1.8.0
## [127] spatstat.linnet_2.3-2 fastmap_1.1.0 httr_1.4.7
## [130] glue_1.6.2 UpSetR_1.4.0 zip_2.2.0
## [133] spatstat_2.3-4 png_0.1-7 stringi_1.7.8
## [136] sass_0.4.1 latticeExtra_0.6-30 future.apply_1.9.0